{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# coding: utf-8\n",
    "import calendar\n",
    "from datetime import datetime\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import lightgbm as lgb\n",
    "from sklearn.ensemble import BaggingRegressor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 辅助函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def addDateInterval(date):\n",
    "    \"\"\"\n",
    "    返回当前日期为从2000年1月开始的第几个月\n",
    "    :dat str\n",
    "        'YYYY-MM-DD'\n",
    "    :return int\n",
    "    \"\"\"\n",
    "    if date == '-1':\n",
    "        return -1\n",
    "    _date = pd.to_datetime(date)\n",
    "    if _date.year < 2000:\n",
    "        return -1\n",
    "    month_interval = (_date.year - 2000) * 12 + _date.month - 1\n",
    "    return month_interval\n",
    "\n",
    "\n",
    "def addVotersDiscretization(x):\n",
    "    \"\"\"\n",
    "    对voters属性进行6级离散化处理，缺失值为-1\n",
    "    :x int\n",
    "    :return int\n",
    "    \"\"\"\n",
    "    if x > -1:\n",
    "        if x < 100:\n",
    "            return 0\n",
    "        elif x < 500:\n",
    "            return 1\n",
    "        elif x < 1000:\n",
    "            return 2\n",
    "        elif x < 2500:\n",
    "            return 3\n",
    "        elif x < 10000:\n",
    "            return 4\n",
    "        else:\n",
    "            return 5\n",
    "    return -1\n",
    "\n",
    "\n",
    "def addMeanPrice(product_quantity, product_info):\n",
    "    \"\"\"\n",
    "    添加平均售价\n",
    "    \"\"\"\n",
    "    new_pq = product_quantity[product_quantity.price != -1].copy()\n",
    "    new_pq['sum'] = new_pq['price'] * new_pq['ciiquantity']\n",
    "    sumc = new_pq.groupby('product_id')['ciiquantity'].sum()\n",
    "    product_info.loc[product_info.index.isin(new_pq.product_id.unique()), 'price'] = (\n",
    "        new_pq.groupby('product_id')['sum'].sum() / sumc)\n",
    "    product_info.fillna(-1, inplace=True)\n",
    "    return product_info\n",
    "\n",
    "\n",
    "def addEval0(col):\n",
    "    \"\"\"\n",
    "    根据评分人数和客户评分计算新的评级特征\n",
    "    :col ['voters','eval3']\n",
    "    :reuturn int\n",
    "    \"\"\"\n",
    "    voters = col[0]\n",
    "    eval3 = col[1]\n",
    "    if voters < 0:\n",
    "        return -1\n",
    "    if voters > 2:\n",
    "        if eval3 > 4:\n",
    "            return 2\n",
    "        elif eval3 < 3:\n",
    "            return 0\n",
    "    return 1\n",
    "\n",
    "\n",
    "def transformProductInfo(product_info, product_quantity):\n",
    "    product_info['index'] = product_info['product_id']\n",
    "    product_info.set_index('index', inplace=True)\n",
    "    product_info['startdate'] = product_info['startdate'].apply(\n",
    "        addDateInterval)\n",
    "    product_info['upgradedate'] = product_info['upgradedate'].apply(\n",
    "        addDateInterval)\n",
    "    product_info['cooperatedate'] = product_info['cooperatedate'].apply(\n",
    "        addDateInterval)\n",
    "\n",
    "    # 订单属性1 我们发现订单属性1对于产品来说其实是一个唯一属性,将其添加进info表\n",
    "    oa1 = product_quantity.drop_duplicates('product_id')\n",
    "    product_info.loc[oa1['product_id'],\n",
    "                     'orderattribute1'] = oa1['orderattribute1'].tolist()\n",
    "    product_info.fillna(-1, inplace=True)\n",
    "\n",
    "    product_info['voters'] = product_info['voters'].apply(\n",
    "        addVotersDiscretization)\n",
    "    product_info = addMeanPrice(product_quantity, product_info)\n",
    "    product_info['eval0'] = product_info[[\n",
    "        'voters', 'eval3']].apply(addEval0, axis=1)\n",
    "\n",
    "    return product_info\n",
    "\n",
    "\n",
    "def change_dat2(col):\n",
    "    \"\"\"\n",
    "    计算当前月份到产品开售月份(start_date)的月数\n",
    "    : col   ['startdate','year','month']\n",
    "    : return int\n",
    "    \"\"\"\n",
    "    start_date, year, month = col\n",
    "    current_date = addDateInterval('-'.join([str(year), str(month).zfill(2)]))\n",
    "    if start_date > current_date:\n",
    "        return -1\n",
    "    return current_date - start_date\n",
    "\n",
    "\n",
    "def get_holiday(col):\n",
    "    '''\n",
    "    计算指定年月的假期天数\n",
    "    : col ['year','month']\n",
    "    : return int\n",
    "    '''\n",
    "    holiday = [9, 11, 10, 9, 10, 10, 8, 10, 8, 12, 10, 8,\n",
    "               10, 11, 9, 9, 11, 8, 8, 10, 9, 13, 9, 8,\n",
    "               11, 11, 8, 10, 10, 9, 10, 8, 9, 13, 8, 9,\n",
    "               12]\n",
    "    year, month = col\n",
    "    return holiday[(year - 2014) * 12 + month - 1]\n",
    "\n",
    "\n",
    "def get_x(quantity, product_info):\n",
    "    \"\"\"\n",
    "    根据product_id product_month 生成训练数据集\n",
    "    : quantity dataframe\n",
    "    : product_info datafrmae\n",
    "    : return dataframe\n",
    "    \"\"\"\n",
    "    x = product_info.loc[quantity['product_id']].copy()\n",
    "    x.reset_index(drop=True, inplace=True)\n",
    "    x['year'] = quantity['product_month'].str[:4].apply(int)\n",
    "    x['month'] = quantity['product_month'].str[5:7].apply(int)\n",
    "    x['startdate'] = x[['startdate', 'year', 'month']].apply(\n",
    "        change_dat2, axis=1)\n",
    "    x['upgradedate'] = x[['upgradedate', 'year', 'month']].apply(\n",
    "        change_dat2, axis=1)\n",
    "    x['cooperatedate'] = x[['cooperatedate', 'year', 'month']].apply(\n",
    "        change_dat2, axis=1)\n",
    "    x['holiday'] = x[['year', 'month']].apply(get_holiday, axis=1)\n",
    "    x = pd.get_dummies(x, columns=['month'])\n",
    "    x['month79'] = 1 - (1 - x['month_7']) * (1 - x['month_9'])\n",
    "\n",
    "    return x\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_path = './'\n",
    "product_quantity = pd.read_csv(data_path + 'product_quantity.txt')\n",
    "product_info = pd.read_csv(data_path + 'product_info.txt',)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "product_info = transformProductInfo(product_info, product_quantity)\n",
    "\n",
    "# 将所有商品每个月的销量转化为一个37*4001的数组供之后处理负数预测值的时候使用\n",
    "product_quantity['product_date'] = product_quantity['product_date'].str[:7]\n",
    "quantity = product_quantity.groupby(['product_id', 'product_date']).sum()\n",
    "quantity.reset_index(inplace=True)\n",
    "\n",
    "quantity_arr = np.full((37, 4001), -1, dtype=np.int32)\n",
    "for idx in quantity.index:\n",
    "    pid = quantity.loc[idx, 'product_id']\n",
    "    date = quantity.loc[idx, 'product_date']\n",
    "    quantity_arr[(int(date[:4]) - 2014) * 12 + int(date[5:7]) -\n",
    "                 1][pid] = quantity.loc[idx, 'ciiquantity']\n",
    "\n",
    "train_y = quantity['ciiquantity'].tolist()\n",
    "\n",
    "quantity.rename(columns={'product_date': 'product_month'}, inplace=True)\n",
    "train_x = get_x(quantity, product_info)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training start\n",
      "training  done\n"
     ]
    }
   ],
   "source": [
    "print('training start')\n",
    "num_clfs = 4\n",
    "clfs = []\n",
    "for i in range(num_clfs):\n",
    "    clf = BaggingRegressor(lgb.sklearn.LGBMRegressor(max_depth=10, n_estimators=2000, num_leaves=80 + 10 * i,),\n",
    "                           n_estimators=6, random_state=i, n_jobs=1, max_samples=0.8 + 0.001 * i,)\n",
    "    clf.fit(train_x, train_y,)\n",
    "    clfs.append(clf)\n",
    "print('training  done')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 执行预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "predicting start\n"
     ]
    }
   ],
   "source": [
    "print('predicting start')\n",
    "out = pd.read_csv(data_path +'prediction_lilei_20170320.txt')\n",
    "test_x = get_x(out[['product_id', 'product_month']], product_info)\n",
    "ans = [clf.predict(test_x) for clf in clfs]\n",
    "# 23个月均无数据的product_id\n",
    "invalid_pid = [i for i in range(1, 4001) if quantity_arr[:23, i].max() < 0]\n",
    "# 将负数的预测改为前23个月有效的最小值,若无则为0\n",
    "history_min = [0] * 4001\n",
    "for i in range(1, 4001):\n",
    "    quantity_i = quantity_arr[:23, i]\n",
    "    if quantity_i.max() < 0:\n",
    "        history_min[i] = 0\n",
    "    else:\n",
    "        history_min[i] = quantity_i[quantity_i > -1].min()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 生成最终提交结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "predicting done\n"
     ]
    }
   ],
   "source": [
    "final_out = out.copy()\n",
    "final_out['ciiquantity_month'] = 0\n",
    "for pred_y in ans:\n",
    "    out['ciiquantity_month'] = pred_y\n",
    "    # 将23个月均无数据且startdate,cooperatedate均在第24个月以后的清0\n",
    "    for pid in invalid_pid:\n",
    "        product_info_i = product_info.loc[pid]\n",
    "        dat = min(product_info_i['startdate'],\n",
    "                  product_info_i['cooperatedate']) - 191 + 23\n",
    "        for j in range(23, dat.astype(np.int)):\n",
    "            out.loc[(j - 23) * 4000 + pid - 1, 'ciiquantity_month'] = 0\n",
    "\n",
    "    idx = out['ciiquantity_month'] < 0\n",
    "    out.loc[idx, 'ciiquantity_month'] = np.array(\n",
    "        history_min)[out.loc[idx, 'product_id']]\n",
    "    final_out['ciiquantity_month'] += out['ciiquantity_month']\n",
    "out['ciiquantity_month'] = final_out['ciiquantity_month'] / len(ans)\n",
    "\n",
    "out.to_csv('l_bg46_lgb100_-1first.txt', index=False)\n",
    "print('predicting done')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
